Exploring CxE metadata and weather data

Available files (in the data-generated folder and subfolders:

  • miappe_metadata.ttl: RDF metadata for 5 CE experiments
  • MIAPPE_CxE_v1.1.xlsx: Excel file with data and metadata - has been converted to miappe-metadata.ttl
  • station_metadata.ttl: information about the "weather stations" with the weather (mean day temperature, hours of sunlight) for the 5 CE experiments, but no actual data
  • in the weather folder:
    • Files contain measurements for hours of sunlight and daily temperature at each location
    • all_weather.ttl combines the content of all files in the weather folder.
  • in the phenotypic folder:
    • Files contain measurements phenotypic measurements for the 5 experiments: data_199NL.ttl, data_2003VE.ttl, data_2004Fin.ttl, data_2005Fin.ttl, data_2010ET.ttl

(Setup and helper functions:)

In [1]:
# setup
import os
import random
import numpy as np
import numpy.polynomial.polynomial as poly
import rdflib
import requests
import SPARQLWrapper
import pandas as pd
from pandas.io.json import json_normalize
from datetime import datetime
from pprint import pprint
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
#from matplotlib.ticker import AutoMinorLocator
import plotly.express as px
import plotly.graph_objects as go
#from numpy.polynomial import Polynomial

pd.set_option('display.max_colwidth', -1)  # or 199
data_loc = 'data-generated' + os.sep
data_weather = data_loc + 'weather' + os.sep
data_pheno = data_loc + 'phenotypic' + os.sep
In [2]:
# helper functions: query_DB, formula_btt, calculate_PBTT
def query_DB(sparql_query):
    '''
    Query the endpoint with the given query string and return the results as a pandas Dataframe.
    '''
    # create the connection to the endpoint
    sparql_service_url = 'http://fuseki:3030/pheno/query'
    #sparql_service_url = 'http://localhost:3030/pheno/query'
    sparql = SPARQLWrapper.SPARQLWrapper(sparql_service_url, agent='Sparql Wrapper on Jupyter example')  
    
    sparql.setQuery(sparql_query)
    sparql.setReturnFormat(SPARQLWrapper.JSON)

    # ask for the result
    result = sparql.query().convert()
    return json_normalize(result['results']['bindings'])


def formula_btt(t):
    # fixed values based on potato
    tb = 5.5
    to = 23.4
    tc = 34.6
    ct = 1.7 
    btt = (
            ((tc - t) / (tc - to)) * 
            ((t - tb) / (to - tb)) ** 
            ((to - tb) / (tc - to))
           ) ** ct
    try:
        return btt if btt > 0 else 0
    except:
        return 0
    

def calculate_PBTT(df_w_cur):    
    df_w_cur = df_w_cur.dropna()
    df_w_cur['mean_temperature'] = pd.to_numeric(df_w_cur['mean_temperature'])
    df_w_cur['hours_of_sunlight'] = pd.to_numeric(df_w_cur['hours_of_sunlight'])
    #df_w_cur = df_w_cur.where(pd.notnull(df_w_cur), None)

    df_w_cur['BTT'] = df_w_cur.apply(
        lambda row: formula_btt(row.mean_temperature), 
        axis = 1) 
    df_w_cur['day_fraction'] = df_w_cur.apply(
        lambda row: row.hours_of_sunlight / 24.0,
        axis = 1)
    df_w_cur['PBTT'] = df_w_cur.apply(
        lambda row: (row.BTT * row.day_fraction) if row.BTT > 0 else 0,
        axis = 1) 
    df_w_cur['cum_PBTT'] = df_w_cur['PBTT'].cumsum()
    cum_pbtt = df_w_cur.loc[df_w_cur.index[-1], 'cum_PBTT']
    return df_w_cur, cum_pbtt


def autolabel(rects):
    '''
    Attach a text label above each bar displaying its height
    taken from: from https://stackoverflow.com/questions/30228069/how-to-display-the-value-of-the-bar-on-each-bar-with-pyplot-barh
    '''
    for rect in rects:
        height = rect.get_height()
        ax.text(rect.get_x() + rect.get_width()/2., 1.05*height,
                round(height,2),
                ha='center', va='bottom')
        
        
def non_increasing(L):
    return all(x>=y for x, y in zip(L, L[1:]))

def non_decreasing(L):
    return all(x<=y for x, y in zip(L, L[1:]))

def monotonic(L):
    return non_increasing(L) or non_decreasing(L)

Exploring the Investigation and Study metadata

We can start with the top MIAPPE object, the Investigation, and examine the properties it has:

In [3]:
# fetch investigation properties
datatable_query = (
'''
    PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#> 

    SELECT distinct ?investigationProperty
    WHERE {
        ?inv a ppeo:investigation .
        ?inv ?investigationProperty ?_
    }
''')
print('Investigation properties:')
result = query_DB(datatable_query)
columnsToKeep = ['investigationProperty.value']
df_inv = result.loc[:, columnsToKeep]
for c in columnsToKeep:
    df_inv.rename(inplace=True, columns={c: c[:-6]})
df_inv
Investigation properties:
Out[3]:
investigationProperty
0 http://www.w3.org/1999/02/22-rdf-syntax-ns#type
1 http://purl.org/ppeo/PPEO.owl#hasName
2 http://purl.org/ppeo/PPEO.owl#hasAssociatedPublication
3 http://purl.org/ppeo/PPEO.owl#hasDescription
4 http://purl.org/ppeo/PPEO.owl#hasIdentifier
5 http://purl.org/ppeo/PPEO.owl#hasLicense
6 http://purl.org/ppeo/PPEO.owl#hasMIAPPEVersion
7 http://purl.org/ppeo/PPEO.owl#hasPart
8 http://purl.org/ppeo/PPEO.owl#hasPersonWithRole
9 http://purl.org/ppeo/PPEO.owl#hasPublicReleaseDate
10 http://purl.org/ppeo/PPEO.owl#hasSubmissionDate

Next, we can check the values of those properties. For those that are not plain literals, we also choose to show their class here (if given):

In [4]:
# fetch investigation properties and values (no reasoning for transitivity)
datatable_query = (
'''
    PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#> 

    SELECT distinct ?investigationProperty ?target ?type
    WHERE {
        ?inv a ppeo:investigation ;
             ?investigationProperty ?target .
        OPTIONAL {?target a ?type}
    } 
    ORDER BY ?investigationProperty
''')
result = query_DB(datatable_query)
columnsToKeep = ['investigationProperty.value', 
                 'target.value', 'type.value']
df_inv = result.loc[:, columnsToKeep]
for c in columnsToKeep:
    df_inv.rename(inplace=True, columns={c: c[:-6]})
df_inv
Out[4]:
investigationProperty target type
0 http://purl.org/ppeo/PPEO.owl#hasAssociatedPublication https://library.wur.nl/WebQuery/wurpubs/fulltext/240586 NaN
1 http://purl.org/ppeo/PPEO.owl#hasDescription FAIRified, partial data from the 2012 thesis of Paula Ximena Hurtado Lopez NaN
2 http://purl.org/ppeo/PPEO.owl#hasIdentifier WUR_inv_CE2020 NaN
3 http://purl.org/ppeo/PPEO.owl#hasLicense CC-BY 4.0 NaN
4 http://purl.org/ppeo/PPEO.owl#hasMIAPPEVersion 1.1 NaN
5 http://purl.org/ppeo/PPEO.owl#hasName Investigating genotype by environment and QTL by environment interactions for developmental traits in potato NaN
6 http://purl.org/ppeo/PPEO.owl#hasPart http://localhost:3030/pheno2/data#study/WUR_inv_CE2020_1999NL http://purl.org/ppeo/PPEO.owl#study
7 http://purl.org/ppeo/PPEO.owl#hasPart http://localhost:3030/pheno2/data#study/WUR_inv_CE2020_2003VE http://purl.org/ppeo/PPEO.owl#study
8 http://purl.org/ppeo/PPEO.owl#hasPart http://localhost:3030/pheno2/data#study/WUR_inv_CE2020_2004Fin http://purl.org/ppeo/PPEO.owl#study
9 http://purl.org/ppeo/PPEO.owl#hasPart http://localhost:3030/pheno2/data#study/WUR_inv_CE2020_2005Fin http://purl.org/ppeo/PPEO.owl#study
10 http://purl.org/ppeo/PPEO.owl#hasPart http://localhost:3030/pheno2/data#study/WUR_inv_CE2020_2010ET http://purl.org/ppeo/PPEO.owl#study
11 http://purl.org/ppeo/PPEO.owl#hasPersonWithRole http://localhost:3030/pheno2/data#person_role/WUR_inv_CE2020_2010ET_coodinator NaN
12 http://purl.org/ppeo/PPEO.owl#hasPersonWithRole http://localhost:3030/pheno2/data#person_role/WUR_inv_CE2020_2010ET_corresponding_author NaN
13 http://purl.org/ppeo/PPEO.owl#hasPersonWithRole http://localhost:3030/pheno2/data#person_role/WUR_inv_CE2020_2010ET_data_author NaN
14 http://purl.org/ppeo/PPEO.owl#hasPersonWithRole http://localhost:3030/pheno2/data#person_role/WUR_inv_CE2020_2010ET_scientist NaN
15 http://purl.org/ppeo/PPEO.owl#hasPersonWithRole http://localhost:3030/pheno2/data#person_role/WUR_inv_CE2020_2010ET_scientist__coordinator NaN
16 http://purl.org/ppeo/PPEO.owl#hasPublicReleaseDate 2020-01-01 NaN
17 http://purl.org/ppeo/PPEO.owl#hasSubmissionDate 2020-01-01 NaN
18 http://www.w3.org/1999/02/22-rdf-syntax-ns#type http://purl.org/ppeo/PPEO.owl#investigation NaN

We can choose to take a look at studies, the next object in the MIAPPE hierarchy, and look at some of their properties.

In [5]:
# fetch study properties
datatable_query = (
'''
    PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#> 

    SELECT distinct ?studyId ?cntName ?addr ?locName ?startDate ?endDate ?lat ?long ?alt 
    WHERE {
        ?study a ppeo:study ;
               ppeo:hasIdentifier ?studyId ;
               ppeo:hasStartDateTime ?startDate ;
               ppeo:hasEndDateTime ?endDate ;
               ppeo:hasLocation ?studyloc .

        ?studyloc ppeo:hasCountry ?cnt ;
                  ppeo:hasLatitude ?lat ;
                  ppeo:hasLongitude ?long ;
                  ppeo:hasAltitude ?alt ;
                  ppeo:hasAddress ?addr ;
                  ppeo:hasName ?locName .
        ?cnt ppeo:hasName ?cntName .
    }
    ORDER BY ?studyId
''')
result = query_DB(datatable_query)
columnsToKeep = ['studyId.value', 'cntName.value', 'locName.value', 'addr.value', 'startDate.value', 
                 'endDate.value', 'lat.value', 'long.value', 'alt.value']
df_studies = result.loc[:, columnsToKeep]
for c in columnsToKeep:
    df_studies.rename(inplace=True, columns={c: c[:-6]})
df_studies
Out[5]:
studyId cntName locName addr startDate endDate lat long alt
0 1999NL NL Wageningen WUR field 1999-03-12 1999-11-14 52.0 5.63 11m
1 2003VE VE Merida Mucupiche field in La Fresa 2003-06-24 2003-11-30 8.6 -71.13 2200m
2 2004Fin FI Ruukki Field in North Ostrobothnia Research Station 2004-04-16 2004-11-10 64.7 25.0 48m
3 2005Fin FI Ruukki Field in North Ostrobothnia Research Station 2005-05-03 2005-11-14 64.7 25.0 48m
4 2010ET ET Holeta Field in Holeta Agricultural Research Institute 2010-07-16 2010-12-06 9.12 38.13 2400m

Verifying that the experiments have overlapping genotype sets

We can easily find the genotypes (biological materials) that were part of all 5 studies:

In [6]:
# common genotypes across 5 studies (intersection across experiments)
datatable_query = (
'''
    PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#> 

    SELECT distinct ?bio_mat_id
    WHERE {
        ?study1 a ppeo:study ;
            ppeo:hasBiologicalMaterial ?bio_mat ;
            ppeo:hasIdentifier '1999NL' .

        ?study2 a ppeo:study ;
            ppeo:hasBiologicalMaterial ?bio_mat ;
            ppeo:hasIdentifier '2005Fin' .

       ?study3 a ppeo:study ;
            ppeo:hasBiologicalMaterial ?bio_mat ;
            ppeo:hasIdentifier '2003VE' .

       ?study4 a ppeo:study ;
            ppeo:hasBiologicalMaterial ?bio_mat ;
            ppeo:hasIdentifier '2004Fin' .

       ?study5 a ppeo:study ;
            ppeo:hasBiologicalMaterial ?bio_mat ;
            ppeo:hasIdentifier '2010ET'  .

        ?bio_mat ppeo:hasIdentifier ?bio_mat_id .

    } 
    ORDER BY ?bio_mat_id
''')
result = query_DB(datatable_query)
columnsToKeep = ['bio_mat_id.value']
df_geno = result.loc[:, columnsToKeep]
print(str(len(df_geno)) + ' genotypes overlap:')
for c in columnsToKeep:
    df_geno.rename(inplace=True, columns={c: c[:-6]})
df_geno.head()
101 genotypes overlap:
Out[6]:
bio_mat_id
0 WUR:CE011
1 WUR:CE032
2 WUR:CE047
3 WUR:CE067
4 WUR:CE070

We can also calculate the total number of genotypes, i.e. the ones that were studied in at least one experiment:

In [7]:
# all genotypes (union across experiments)
datatable_query = (
'''
    PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#> 

    SELECT distinct ?bio_mat_id
    WHERE {
        ?study1 a ppeo:study ;
            ppeo:hasBiologicalMaterial ?bio_mat .
        ?bio_mat ppeo:hasIdentifier ?bio_mat_id .

    } 
    ORDER BY ?bio_mat_id
''')
result = query_DB(datatable_query)
columnsToKeep = ['bio_mat_id.value']
df_geno = result.loc[:, columnsToKeep]
print(str(len(df_geno)) + ' genotypes in total:')
for c in columnsToKeep:
    df_geno.rename(inplace=True, columns={c: c[:-6]})
df_geno.head()
292 genotypes in total:
Out[7]:
bio_mat_id
0 WUR:ASTARTE
1 WUR:AWASH
2 WUR:BELETE
3 WUR:BINTJE
4 WUR:BULLE
In [8]:
# genotypes per experiment
datatable_query = (
'''
    PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#> 

    SELECT (?study_id as ?Study_ID) (count(distinct ?bio_mat_id) as ?genotypes)
    WHERE {
        ?study a ppeo:study ;
            ppeo:hasBiologicalMaterial ?bio_mat ;
            ppeo:hasIdentifier ?study_id .
        ?bio_mat ppeo:hasIdentifier ?bio_mat_id .
    } GROUP BY ?study_id

''')
result = query_DB(datatable_query)
columnsToKeep = ['Study_ID.value', 'genotypes.value']
df_geno = result.loc[:, columnsToKeep]
print('Genotypes per experiment:')
for c in columnsToKeep:
    df_geno.rename(inplace=True, columns={c: c[:-6]})
df_geno.head()
Genotypes per experiment:
Out[8]:
Study_ID genotypes
0 2005Fin 260
1 2003VE 107
2 2010ET 190
3 2004Fin 251
4 1999NL 242

Checking if there is weather data available for the location of interest

In [9]:
# pull stations from /weather with service
datatable_query = (
'''
    PREFIX aemet: <http://aemet.linkeddata.es/ontology/> 
    PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#> 
    PREFIX ssn: <http://purl.oclc.org/NET/ssnx/ssn#> 
    PREFIX w3ctime: <http://www.w3.org/2006/time#> 
    PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#> 
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

    SELECT distinct ?name ?latitude ?longitude ?altitude
      WHERE {
        SERVICE <http://localhost:3030/weather/query> {
            ?ws a aemet:WeatherStation ;
                aemet:stationName ?name ;
                geo:location ?loc .
            ?loc geo:lat ?latitude ;
                 geo:long ?longitude ;
                geo:alt ?altitude .
        }
    } ORDER BY ASC(?obsTime)
''')
result = query_DB(datatable_query)
columnsToKeep = ['name.value', 'latitude.value', 
                 'longitude.value', 'altitude.value']
df_stations = result[columnsToKeep]
for c in columnsToKeep:
    df_stations.rename(inplace=True, columns={c: c[:-6]})
df_stations
C:\Python37-32\lib\site-packages\pandas\core\frame.py:4025: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)
Out[9]:
name latitude longitude altitude
0 WUR station 52 5.63 11
1 Merida station 8.6 -71.13 2200
2 Holeta station 9.12 38 2400
3 Ruuki station 64.7 25 48
In [10]:
# pull NL sunhours from /weather with service
datatable_query = (
'''
    PREFIX aemet: <http://aemet.linkeddata.es/ontology/> 
    PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#> 
    PREFIX ssn: <http://purl.oclc.org/NET/ssnx/ssn#> 
    PREFIX w3ctime: <http://www.w3.org/2006/time#> 
    PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#> 
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

    SELECT ?obsTime ?obsVal
      WHERE {
        SERVICE <http://localhost:3030/weather/query> {
          ?obs a aemet:Observation ;
            aemet:valueOfObservedData ?obsVal ;
            aemet:observedInInterval ?obsTime ;
            ssn:observedProperty ?var ;
            ssn:observedBy ?station .

          ?station aemet:stationName ?obsStation .
          FILTER("WUR station"=?obsStation) .
          FILTER(?obsTime > "1999-05-11"^^xsd:date && 
                 ?obsTime < "1999-11-15"^^xsd:date)

          ?var a aemet:AmbientProperty ;
               rdfs:label "Hours of sunlight" .
        }
    } ORDER BY ASC(?obsTime)
''')
result = query_DB(datatable_query)
columnsToKeep = ['obsTime.value', 'obsVal.value']
df_nl_sun = result[columnsToKeep]
for c in columnsToKeep:
    df_nl_sun.rename(inplace=True, columns={c: c[:-6]})
display(df_nl_sun.head())
obsTime obsVal
0 1999-05-12 15.54
1 1999-05-12 15.54
2 1999-05-13 15.59
3 1999-05-13 15.59
4 1999-05-14 15.64

Plot (hours of sunlight):

In [11]:
# make plot

df_nl_sun.loc[:, 'obsTime'] = pd.to_datetime(df_nl_sun['obsTime'])
df_nl_sun.loc[:, 'obsVal'] = pd.to_numeric(df_nl_sun['obsVal'], downcast="float")
    
pd.plotting.register_matplotlib_converters()
fig, ax = plt.subplots(figsize=(10, 5))

ax.plot(df_nl_sun['obsTime'], df_nl_sun['obsVal'])
ax.set_title('Daily hours of sunlight (NL)')
plt.ylabel('Hours per day')
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=1)) 
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
C:\Python37-32\lib\site-packages\pandas\core\indexing.py:635: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value
C:\Python37-32\lib\site-packages\pandas\core\indexing.py:543: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
In [12]:
# check distances from experimental locations to weather station coordinates
datatable_query = (
'''
    PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#> 
    PREFIX aemet: <http://aemet.linkeddata.es/ontology/>
    PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#> 
    PREFIX ssn: <http://purl.oclc.org/NET/ssnx/ssn#> 
    PREFIX w3ctime: <http://www.w3.org/2006/time#> 
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

    SELECT distinct ?studyId ?cntName ?stu_locName ?stu_lat ?stu_long ?w_name ?w_lat ?w_long ?dist_sq
    WHERE {
        ?study a ppeo:study ;
               ppeo:hasIdentifier ?studyId ;
               ppeo:hasLocation ?studyloc .

        ?studyloc ppeo:hasCountry ?cnt ;
                  ppeo:hasLatitude ?stu_lat ;
                  ppeo:hasLongitude ?stu_long ;
                  ppeo:hasName ?stu_locName .
        ?cnt ppeo:hasName ?cntName .
  
        SERVICE <http://localhost:3030/weather/query> {
            ?ws a aemet:WeatherStation ;
                aemet:stationName ?w_name ;
                geo:location ?w_loc .
            ?w_loc geo:lat ?w_lat ; geo:long ?w_long .
        }
        
      BIND(((?stu_lat - ?w_lat) * (?stu_lat - ?w_lat)) AS ?lat_diff_sq)
      BIND(((?stu_long - ?w_long) * (?stu_long - ?w_long)) AS ?long_diff_sq)
      BIND((?lat_diff_sq + ?long_diff_sq) AS ?dist_sq)
    }
    ORDER BY ?dist_sq
''')
result = query_DB(datatable_query)
columnsToKeep = ['studyId.value', 'cntName.value', 'stu_locName.value', 'stu_lat.value', 'stu_long.value', 
                 'w_name.value', 'w_lat.value', 'w_long.value', 'dist_sq.value']
df_distances = result[columnsToKeep]
for c in columnsToKeep:
    df_distances.rename(inplace=True, columns={c: c[:-6]})
display(df_distances)
studyId cntName stu_locName stu_lat stu_long w_name w_lat w_long dist_sq
0 2004Fin FI Ruukki 64.7 25.0 Ruuki station 64.7 25 0.0
1 2005Fin FI Ruukki 64.7 25.0 Ruuki station 64.7 25 0.0
2 1999NL NL Wageningen 52.0 5.63 WUR station 52 5.63 0.0
3 2003VE VE Merida 8.6 -71.13 Merida station 8.6 -71.13 0.0
4 2010ET ET Holeta 9.12 38.13 Holeta station 9.12 38 0.016900279
5 2004Fin FI Ruukki 64.7 25.0 WUR station 52 5.63 536.4868
6 2005Fin FI Ruukki 64.7 25.0 WUR station 52 5.63 536.4868
7 1999NL NL Wageningen 52.0 5.63 Ruuki station 64.7 25 536.4868
8 1999NL NL Wageningen 52.0 5.63 Holeta station 9.12 38 2886.5112
9 2010ET ET Holeta 9.12 38.13 WUR station 52 5.63 2894.9443
10 2004Fin FI Ruukki 64.7 25.0 Holeta station 9.12 38 3258.1362
11 2005Fin FI Ruukki 64.7 25.0 Holeta station 9.12 38 3258.1362
12 2010ET ET Holeta 9.12 38.13 Ruuki station 64.7 25 3261.5332
13 1999NL NL Wageningen 52.0 5.63 Merida station 8.6 -71.13 7775.6567
14 2003VE VE Merida 8.6 -71.13 WUR station 52 5.63 7775.6567
15 2003VE VE Merida 8.6 -71.13 Holeta station 9.12 38 11909.627
16 2010ET ET Holeta 9.12 38.13 Merida station 8.6 -71.13 11938.017
17 2004Fin FI Ruukki 64.7 25.0 Merida station 8.6 -71.13 12388.187
18 2005Fin FI Ruukki 64.7 25.0 Merida station 8.6 -71.13 12388.187
19 2003VE VE Merida 8.6 -71.13 Ruuki station 64.7 25 12388.187

Focusing on a trait and plotting it for all experiments, for each genotype

Reading data files for each study

We can view the variables in a data file for a study as follows:

Netherlands 1999 data

In [13]:
# variables for 1999NL
datatable_query = ('''
    PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
    SELECT distinct ?filename ?varName 
    WHERE {
        ?file ppeo:hasObservation ?obs ;
           ppeo:hasDigitalLocation ?filename .
        ?study ppeo:hasDataFile ?file ;
               ppeo:hasIdentifier '1999NL' .
        ?obs ppeo:hasVariable ?var .
        ?study ppeo:hasPart ?var .
        ?var ppeo:hasIdentifier ?varName .   
    }
''')
result = query_DB(datatable_query)
columnsToKeep = ['filename.value', 'varName.value']
df_xvars = result[columnsToKeep]
for c in columnsToKeep:
    df_xvars.rename(inplace=True, columns={c: c[:-6]})
df_xvars
Out[13]:
filename varName
0 data_1999NL.csv TubN_average_per_genotype
1 data_1999NL.csv TubN_total_per_plant
2 data_1999NL.csv TubW_average_per_genotype
3 data_1999NL.csv TubW_total_per_plant

Create the data table for this study. We will also add a column to hold the genotypes for later use.

(The code for putting SPARQLWrapper output into a dataframe is based on this page)

In [14]:
# 1999NL data table
# endpoint: http://localhost:7200/repositories/miappe_20'
datatable_query = (
'''
    PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
    SELECT DISTINCT ?filename ?date ?ouID ?bmID ?TubN_total_per_plant ?TubW_total_per_plant # ?TubN_average_per_genotype ?TubW_average_per_genotype
    WHERE {
        ?study a ppeo:study ;
               ppeo:hasIdentifier '1999NL' ;
               ppeo:hasDataFile ?file .

        ?file a ppeo:data_file ;
              ppeo:hasDigitalLocation ?filename .

        ?file ppeo:hasObservation ?obs1 .
        ?var1 a ppeo:observed_variable ;
                ppeo:hasIdentifier 'TubN_total_per_plant' .
        ?obs1 ppeo:hasVariable ?var1 ;
              ppeo:hasDateTime ?date ;
              ppeo:hasObservedSubject ?ou ;
              ppeo:hasValue ?TubN_total_per_plant .

        OPTIONAL {?file ppeo:hasObservation ?obs2 .
                  ?var2 a ppeo:observed_variable ;
                        ppeo:hasIdentifier 'TubW_total_per_plant' .
                  ?obs2 ppeo:hasVariable ?var2 ;
                        ppeo:hasDateTime ?date ;
                         ppeo:hasObservedSubject ?ou ;
                         ppeo:hasValue ?TubW_total_per_plant .}

#        OPTIONAL {?file ppeo:hasObservation ?obs3 .
#                  ?var3 a ppeo:observed_variable ;
#                        ppeo:hasIdentifier 'TubN_average_per_genotype' .
#                  ?obs3 ppeo:hasVariable ?var3 ;
#                        ppeo:hasDateTime ?date ;
#                         ppeo:hasObservedSubject ?ou ;
#                         ppeo:hasValue ?TubN_average_per_genotype .}
#
#        OPTIONAL {?file ppeo:hasObservation ?obs4 .
#                  ?var4 a ppeo:observed_variable ;
#                        ppeo:hasIdentifier 'TubW_average_per_genotype' .
#                  ?obs4 ppeo:hasVariable ?var4 ;
#                        ppeo:hasDateTime ?date ;
#                        ppeo:hasObservedSubject ?ou ;
#                        ppeo:hasValue ?TubW_average_per_genotype .}

        ?ou a ppeo:observation_unit ;
            ppeo:hasIdentifier ?ouID ;
            ppeo:hasBiologicalMaterial ?bm .
        ?bm ppeo:hasIdentifier ?bmID .
    }
    ORDER BY ASC(?date) ASC(?ouID)
''')
result = query_DB(datatable_query)
columnsToKeep = ['filename.value', 'date.value', 'ouID.value', 'bmID.value', 
                 'TubN_total_per_plant.value', 'TubW_total_per_plant.value']
datatable_1999NL = result[columnsToKeep]
for c in columnsToKeep:
    datatable_1999NL.rename(inplace=True, columns={c: c[:-6]})

datatable_1999NL.loc[:, 'TubW_total_per_plant'] = pd.to_numeric(datatable_1999NL['TubW_total_per_plant'], downcast="float")
datatable_1999NL_perPlant = datatable_1999NL.groupby('bmID')['TubW_total_per_plant'].mean()
display(datatable_1999NL.head())
display(datatable_1999NL.describe())
filename date ouID bmID TubN_total_per_plant TubW_total_per_plant
0 data_1999NL.csv 1999-11-14 WUR:CE-NL99-WUR:CE011-1 WUR:CE011 6.1e+01 561.559998
1 data_1999NL.csv 1999-11-14 WUR:CE-NL99-WUR:CE011-2 WUR:CE011 4.6e+01 1212.630005
2 data_1999NL.csv 1999-11-14 WUR:CE-NL99-WUR:CE016-1 WUR:CE016 4.9e+01 2637.739990
3 data_1999NL.csv 1999-11-14 WUR:CE-NL99-WUR:CE016-2 WUR:CE016 2.1e+01 608.919983
4 data_1999NL.csv 1999-11-14 WUR:CE-NL99-WUR:CE017-1 WUR:CE017 2.5e+01 395.899994
TubW_total_per_plant
count 452.000000
mean 1380.570801
std 794.384644
min 24.910000
25% 715.117477
50% 1330.800049
75% 1877.225006
max 4220.200195
In [15]:
# calculate average weight per genotype
# 1999NL data - averaged
# endpoint: http://localhost:7200/repositories/miappe_20'
datatable_query = (
'''
    PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>

    SELECT DISTINCT  ?bmID (avg(?TubW_total_per_plant) as ?TubW_T_avg)
    WHERE {
        ?study a ppeo:study ;
               ppeo:hasIdentifier '1999NL' ;
               ppeo:hasDataFile ?file .

        ?file a ppeo:data_file ;
              ppeo:hasDigitalLocation ?filename ;
              ppeo:hasObservation ?obs1 .

        ?var1 a ppeo:observed_variable ;
              ppeo:hasIdentifier 'TubW_total_per_plant' .

        ?obs1 ppeo:hasVariable ?var1 ;
              ppeo:hasValue ?TubW_total_per_plant ;
              ppeo:hasDateTime ?date ;
              ppeo:hasObservedSubject ?ou .    

        ?ou a ppeo:observation_unit ;
            ppeo:hasIdentifier ?ouID ;
            ppeo:partOf ?study ;
            ppeo:hasBiologicalMaterial ?bm .
        ?bm ppeo:hasIdentifier ?bmID .
    }
    GROUP BY ?bmID
''')
result = query_DB(datatable_query)
columnsToKeep = ['bmID.value', 'TubW_T_avg.value']
datatable_1999NL_avg  = result[columnsToKeep]
for c in columnsToKeep:
    datatable_1999NL_avg.rename(inplace=True, columns={c: c[:-6]})
datatable_1999NL_avg.head()
Out[15]:
bmID TubW_T_avg
0 WUR:CE175 1190.325e0
1 WUR:CE296 670.5250000000001e0
2 WUR:CE640 1167.605e0
3 WUR:CE761 1748.4699999999998e0
4 WUR:CE652 1759.3e0

Venezuela 2003 data:

Once again, when building the data table, we will also add a column to hold the genotypes for later use.

In [16]:
# 2003VE data table
# endpoint: http://localhost:7200/repositories/miappe_20'
datatable_query = (
'''
    PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
    
    SELECT DISTINCT ?filename ?date ?ouID ?bmID ?TubN_average_per_genotype ?TubW_average_per_genotype
    WHERE {
        ?study a ppeo:study ;
               ppeo:hasIdentifier '2003VE' ;
               ppeo:hasDataFile ?file .
        
        ?file a ppeo:data_file ;
              ppeo:hasDigitalLocation ?filename .
     
        OPTIONAL {?file ppeo:hasObservation ?obs1 .
        ?var1 a ppeo:observed_variable ;
              ppeo:hasIdentifier 'TubN_average_per_genotype' .
        ?obs1 ppeo:hasVariable ?var1 ;
              ppeo:hasDateTime ?date ;
               ppeo:hasObservedSubject ?ou ;
               ppeo:hasValue ?TubN_average_per_genotype .}
            
        OPTIONAL {?file ppeo:hasObservation ?obs2 .
        ?var2 a ppeo:observed_variable ;
              ppeo:hasIdentifier 'TubW_average_per_genotype' .
        ?obs2 ppeo:hasVariable ?var2 ;
              ppeo:hasDateTime ?date ;
               ppeo:hasObservedSubject ?ou ;
               ppeo:hasValue ?TubW_average_per_genotype .}
        
        ?ou a ppeo:observation_unit ;
            ppeo:hasIdentifier ?ouID ;
            ppeo:partOf ?study ;
            ppeo:hasBiologicalMaterial ?bm .
        ?bm ppeo:hasIdentifier ?bmID .
    }
    ORDER BY ASC(?date) ASC(?ouID)
''')
result = query_DB(datatable_query)
columnsToKeep = ['filename.value', 'date.value', 'ouID.value', 'bmID.value',
                 'TubN_average_per_genotype.value', 'TubW_average_per_genotype.value']
datatable_2003VE = result[columnsToKeep]
for c in columnsToKeep:
    datatable_2003VE.rename(inplace=True, columns={c: c[:-6]})
    
datatable_2003VE.loc[:, 'TubW_average_per_genotype'] = pd.to_numeric(datatable_2003VE['TubW_average_per_genotype'], downcast="float")
datatable_2003VE_perPlant = datatable_2003VE.groupby('bmID')['TubW_average_per_genotype'].mean()

display(datatable_2003VE.head())
display(datatable_2003VE.describe())
filename date ouID bmID TubN_average_per_genotype TubW_average_per_genotype
0 data_2003VE.csv 2003-11-30 WUR:CE-VEN03-CE011 WUR:CE011 1.85e+01 142.5
1 data_2003VE.csv 2003-11-30 WUR:CE-VEN03-CE032 WUR:CE032 8e+00 237.5
2 data_2003VE.csv 2003-11-30 WUR:CE-VEN03-CE047 WUR:CE047 2.1e+01 230.0
3 data_2003VE.csv 2003-11-30 WUR:CE-VEN03-CE067 WUR:CE067 1.15e+01 62.5
4 data_2003VE.csv 2003-11-30 WUR:CE-VEN03-CE070 WUR:CE070 7e+00 125.0
TubW_average_per_genotype
count 107.000000
mean 121.906540
std 110.696785
min 8.000000
25% 56.000000
50% 100.000000
75% 167.250000
max 975.000000

Ethiopia 2010 data

The data for Ethiopia is sparse, so constructing the full table with SPARQL is inconvenient. We only get the tuber weight per plant:

In [17]:
# 2010ET data table
# endpoint: http://localhost:7200/repositories/miappe_20'
datatable_query = (
'''
    PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>

    SELECT DISTINCT ?filename ?date ?ouID ?TubW_total_per_plant
    WHERE {
        ?study a ppeo:study ;
               ppeo:hasIdentifier '2010ET' ;
               ppeo:hasDataFile ?file .

        ?file a ppeo:data_file ;
              ppeo:hasDigitalLocation ?filename ;
              ppeo:hasObservation ?obs1 .

        ?var1 a ppeo:observed_variable ;
              ppeo:hasIdentifier 'TubW_total_per_plant' .

        ?obs1 ppeo:hasVariable ?var1 ;
              ppeo:hasValue ?TubW_total_per_plant ;
              ppeo:hasDateTime ?date ;
              ppeo:hasObservedSubject ?ou .    

        ?ou a ppeo:observation_unit ;
            ppeo:hasIdentifier ?ouID ;
            ppeo:partOf ?study.
    }
    ORDER BY ASC(?date) ASC(?ouID)
''')
result = query_DB(datatable_query)
columnsToKeep = ['filename.value', 'date.value', 'ouID.value', 
                 'TubW_total_per_plant.value']
datatable_2010ET  = result[columnsToKeep]
for c in columnsToKeep:
    datatable_2010ET .rename(inplace=True, columns={c: c[:-6]})
datatable_2010ET.head()
Out[17]:
filename date ouID TubW_total_per_plant
0 data_2010ET.csv 2010-12-01 WUR:CE-ETH10-1-1-1 550
1 data_2010ET.csv 2010-12-01 WUR:CE-ETH10-1-1-2 191
2 data_2010ET.csv 2010-12-01 WUR:CE-ETH10-1-1-3 535
3 data_2010ET.csv 2010-12-01 WUR:CE-ETH10-1-1-4 250
4 data_2010ET.csv 2010-12-01 WUR:CE-ETH10-1-10-1 300

At the same time, we only have the average tuber weight per genotype for the Netherlands and Venezuela. We can calculate the same thing for Ethiopia and Finland. (new table datatable_2010ET2)

In [18]:
# 2010ET data - averaged
# endpoint: http://localhost:7200/repositories/miappe_20'
datatable_query = (
'''
    PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>

    SELECT DISTINCT  ?bmID (avg(?TubW_total_per_plant) as ?TubW_T_avg)
    WHERE {
        ?study a ppeo:study ;
               ppeo:hasIdentifier '2010ET' ;
               ppeo:hasDataFile ?file .

        ?file a ppeo:data_file ;
              ppeo:hasDigitalLocation ?filename ;
              ppeo:hasObservation ?obs1 .

        ?var1 a ppeo:observed_variable ;
              ppeo:hasIdentifier 'TubW_total_per_plant' .

        ?obs1 ppeo:hasVariable ?var1 ;
              ppeo:hasValue ?TubW_total_per_plant ;
              ppeo:hasDateTime ?date ;
              ppeo:hasObservedSubject ?ou .    

        ?ou a ppeo:observation_unit ;
            ppeo:hasIdentifier ?ouID ;
            ppeo:partOf ?study ;
            ppeo:hasBiologicalMaterial ?bm .
        ?bm ppeo:hasIdentifier ?bmID .
    }
    GROUP BY ?bmID
''')
result = query_DB(datatable_query)
columnsToKeep = ['bmID.value', 'TubW_T_avg.value']
datatable_2010ET_avg  = result[columnsToKeep]
for c in columnsToKeep:
    datatable_2010ET_avg.rename(inplace=True, columns={c: c[:-6]})
datatable_2010ET_avg.head()
Out[18]:
bmID TubW_T_avg
0 WUR:CE680 143.666666666666666666666666
1 WUR:CE692 446.666666666666666666666666
2 WUR:CE640 177.416666666666666666666666
3 WUR:GUDENE 581.666666666666666666666666
4 WUR:CE761 137.5

Finland 2004 data

The data for Finland is sparse, so constructing the full table with SPARQL is inconvenient. We only get the tuber weight per plant, for block 8 (the one that was harvested last):

In [19]:
# 2004Fin data table
datatable_query = (
'''
    PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

    SELECT DISTINCT ?filename ?date ?ouID ?TubW_total_per_plant
    WHERE {
        ?study a ppeo:study ;
               ppeo:hasIdentifier '2004Fin' ;
               ppeo:hasDataFile ?file .

        ?file a ppeo:data_file ;
              ppeo:hasDigitalLocation ?filename ;
              ppeo:hasObservation ?obs1 .

        ?var1 a ppeo:observed_variable ;
              ppeo:hasIdentifier 'TubW_total_per_plant' .

        ?obs1 ppeo:hasVariable ?var1 ;
              ppeo:hasValue ?TubW_total_per_plant ;
              ppeo:hasDateTime ?date ;
              ppeo:hasObservedSubject ?ou .    

        ?ou a ppeo:observation_unit ;
            ppeo:hasIdentifier ?ouID ;
            ppeo:partOf ?study;
            ppeo:hasSpatialDistribution ?sd .

        ?sd ppeo:hasSpatialDistributionType ?sdtype .
        ?sd ppeo:hasValue "8" .
        ?sdtype ppeo:hasType "block" .
    
    }
    ORDER BY ASC(?date) ASC(?ouID)
''')
result = query_DB(datatable_query)
columnsToKeep = ['filename.value', 'date.value', 'ouID.value', 
                 'TubW_total_per_plant.value']
datatable_2004Fin  = result[columnsToKeep]
for c in columnsToKeep:
    datatable_2004Fin .rename(inplace=True, columns={c: c[:-6]})

# divide by 3 since there were 3 plants
datatable_2004Fin.loc[:, 'TubW_total_per_plant'] = pd.to_numeric(datatable_2004Fin['TubW_total_per_plant'], downcast="float") / 3
datatable_2004Fin.head()
Out[19]:
filename date ouID TubW_total_per_plant
0 data_2004Fin.csv 2004-09-20 WUR:CE-FIN04-8-1561-A 789.363342
1 data_2004Fin.csv 2004-09-20 WUR:CE-FIN04-8-1561-B 669.080017
2 data_2004Fin.csv 2004-09-20 WUR:CE-FIN04-8-1561-C 587.989990
3 data_2004Fin.csv 2004-09-20 WUR:CE-FIN04-8-1562-A 667.753357
4 data_2004Fin.csv 2004-09-20 WUR:CE-FIN04-8-1562-B 753.156677

Calculating the average tuber weight per genotype:

In [20]:
# 2004Fin data - averaged
# endpoint: http://localhost:7200/repositories/miappe_20'
datatable_query = (
'''
    PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>

    SELECT DISTINCT  ?bmID (avg(?TubW_total_per_plant) as ?TubW_T_avg)
    WHERE {
        ?study a ppeo:study ;
               ppeo:hasIdentifier '2004Fin' ;
               ppeo:hasDataFile ?file .

        ?file a ppeo:data_file ;
              ppeo:hasDigitalLocation ?filename ;
              ppeo:hasObservation ?obs1 .

        ?var1 a ppeo:observed_variable ;
              ppeo:hasIdentifier 'TubW_total_per_plant' .

        ?obs1 ppeo:hasVariable ?var1 ;
              ppeo:hasValue ?TubW_total_per_plant ;
              ppeo:hasDateTime ?date ;
              ppeo:hasObservedSubject ?ou .    

        ?ou a ppeo:observation_unit ;
            ppeo:hasIdentifier ?ouID ;
            ppeo:partOf ?study ;
            ppeo:hasBiologicalMaterial ?bm ;
            ppeo:hasSpatialDistribution ?sd .

        ?sd ppeo:hasSpatialDistributionType ?sdtype .
        ?sd ppeo:hasValue "8" .
        ?sdtype ppeo:hasType "block" .
        ?bm ppeo:hasIdentifier ?bmID .
    }
    GROUP BY ?bmID
''')
result = query_DB(datatable_query)
columnsToKeep = ['bmID.value', 'TubW_T_avg.value']
datatable_2004Fin_avg  = result[columnsToKeep]
for c in columnsToKeep:
    datatable_2004Fin_avg.rename(inplace=True, columns={c: c[:-6]})
    
datatable_2004Fin_avg.loc[:, 'TubW_T_avg_perPlant'] = pd.to_numeric(datatable_2004Fin_avg['TubW_T_avg'], downcast="float") / 3
datatable_2004Fin_avg.head()
C:\Python37-32\lib\site-packages\pandas\core\indexing.py:362: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
Out[20]:
bmID TubW_T_avg TubW_T_avg_perPlant
0 WUR:CE175 2784.433333333333e0 928.144470
1 WUR:VOYAGER 1929.8799999999999e0 643.293335
2 WUR:CE640 2592.2066666666665e0 864.068909
3 WUR:CE773 3390.536666666667e0 1130.178833
4 WUR:CE664 2423.9966666666664e0 807.998840

Get plant height data

In [21]:
# 2004Findata - plant height
# endpoint: http://localhost:7200/repositories/miappe_20'
datatable_query = ('''
    PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>

    SELECT DISTINCT ?date ?ouID ?bmID ?plantHeight
    WHERE {
        ?study a ppeo:study ;
               ppeo:hasIdentifier '2004Fin' ;
               ppeo:hasDataFile ?file .

        ?file a ppeo:data_file ;
              ppeo:hasDigitalLocation ?filename ;
              ppeo:hasObservation ?obs1 .

        ?var1 a ppeo:observed_variable ;
              ppeo:hasIdentifier 'PlantHeight' .

        ?obs1 ppeo:hasVariable ?var1 ;
              ppeo:hasValue ?plantHeight ;
              ppeo:hasDateTime ?date ;
              ppeo:hasObservedSubject ?ou .    

        ?ou a ppeo:observation_unit ;
            ppeo:hasIdentifier ?ouID ;
            ppeo:partOf ?study;
            ppeo:hasBiologicalMaterial ?bm .
        ?bm ppeo:hasIdentifier ?bmID .
    }
    ORDER BY ASC(?date) ASC(?ouID)
''')
result = query_DB(datatable_query)
columnsToKeep = ['date.value', 'ouID.value', 'bmID.value', 'plantHeight.value']
datatable_2004Fin_ph  = result[columnsToKeep]
for c in columnsToKeep:
    datatable_2004Fin_ph.rename(inplace=True, columns={c: c[:-6]})
datatable_2004Fin_ph.head()

# get mean for each genotype:
datatable_2004Fin_ph2 = datatable_2004Fin_ph[datatable_2004Fin_ph.plantHeight != '*']
datatable_2004Fin_ph2.loc[:, 'plantHeight'] = pd.to_numeric(datatable_2004Fin_ph2['plantHeight'], downcast="float")

# averages per plot:
datatable_2004Fin_ph3 = datatable_2004Fin_ph2.assign(plot=datatable_2004Fin_ph2['ouID'].str[:-2])
datatable_2004Fin_ph3['bmDateOu'] = (datatable_2004Fin_ph3['bmID'] + ';' + datatable_2004Fin_ph3['date'] + 
                                   ';' + datatable_2004Fin_ph3['plot'])
datatable_2004Fin_ph_perPlant = datatable_2004Fin_ph3.groupby('bmDateOu')['plantHeight'].mean()
datatable_2004Fin_ph_perPlant = datatable_2004Fin_ph_perPlant.to_frame().reset_index()
datatable_2004Fin_ph_perPlant[['bmID','Date','plot']] = datatable_2004Fin_ph_perPlant['bmDateOu'].str.split(';',expand=True,)
datatable_2004Fin_ph_perPlant = datatable_2004Fin_ph_perPlant.drop('bmDateOu', 1)

# averages per genotype:
datatable_2004Fin_ph4 = datatable_2004Fin_ph2
datatable_2004Fin_ph4['bmDate'] = (datatable_2004Fin_ph2['bmID'] + ';' + datatable_2004Fin_ph2['date'])
datatable_2004Fin_ph_perPlant4 = datatable_2004Fin_ph4.groupby('bmDate')['plantHeight'].mean()
datatable_2004Fin_ph_perPlant4 = datatable_2004Fin_ph_perPlant4.to_frame().reset_index()
datatable_2004Fin_ph_perPlant4[['bmID','Date']] = datatable_2004Fin_ph_perPlant4['bmDate'].str.split(';',expand=True,)
datatable_2004Fin_ph_perPlant4 = datatable_2004Fin_ph_perPlant4.drop('bmDate', 1)
datatable_2004Fin_ph_perPlant4.loc[:, 'Date'] = pd.to_datetime(datatable_2004Fin_ph_perPlant4['Date'])

datatable_2004Fin_ph_perPlant4['planting'] = pd.to_datetime('2004-06-01')  # planting date: 16 july 2010
datatable_2004Fin_ph_perPlant4['DAP'] = (datatable_2004Fin_ph_perPlant4['Date'] - 
                                        datatable_2004Fin_ph_perPlant4['planting']).dt.days

datatable_2004Fin_ph_perPlant4.head()
Out[21]:
plantHeight bmID Date planting DAP
0 20.500000 WUR:ASTARTE 2004-06-30 2004-06-01 29
1 30.666666 WUR:ASTARTE 2004-07-07 2004-06-01 36
2 42.333332 WUR:ASTARTE 2004-07-13 2004-06-01 42
3 59.666668 WUR:ASTARTE 2004-07-20 2004-06-01 49
4 60.500000 WUR:ASTARTE 2004-07-27 2004-06-01 56

Finland 2005 data

The data for Finland is sparse, so constructing the full table with SPARQL is inconvenient. We only get the tuber weight per plant:

In [22]:
# 2005Fin data table
# endpoint: http://localhost:7200/repositories/miappe_20'
datatable_query = (
'''
    PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>

    SELECT DISTINCT ?filename ?date ?ouID ?TubW_total_per_plant
    WHERE {
        ?study a ppeo:study ;
               ppeo:hasIdentifier '2005Fin' ;
               ppeo:hasDataFile ?file .

        ?file a ppeo:data_file ;
              ppeo:hasDigitalLocation ?filename ;
              ppeo:hasObservation ?obs1 .

        ?var1 a ppeo:observed_variable ;
              ppeo:hasIdentifier 'TubW_total_per_plant' .

        ?obs1 ppeo:hasVariable ?var1 ;
              ppeo:hasValue ?TubW_total_per_plant ;
              ppeo:hasDateTime ?date ;
              ppeo:hasObservedSubject ?ou .    

        ?ou a ppeo:observation_unit ;
            ppeo:hasIdentifier ?ouID ;
            ppeo:partOf ?study;
            ppeo:hasSpatialDistribution ?sd .

        ?sd ppeo:hasSpatialDistributionType ?sdtype .
        ?sd ppeo:hasValue "IV" .
        ?sdtype ppeo:hasType "block" .
    }
    ORDER BY ASC(?date) ASC(?ouID)
''')
result = query_DB(datatable_query)
columnsToKeep = ['filename.value', 'date.value', 'ouID.value', 
                 'TubW_total_per_plant.value']
datatable_2005Fin  = result[columnsToKeep]
for c in columnsToKeep:
    datatable_2005Fin .rename(inplace=True, columns={c: c[:-6]})
datatable_2005Fin.head()
Out[22]:
filename date ouID TubW_total_per_plant
0 data_2005Fin.csv 2005-09-20 WUR:CE-FIN05-IV-1000-A 3.44e+02
1 data_2005Fin.csv 2005-09-20 WUR:CE-FIN05-IV-1000-B 1.0105e+03
2 data_2005Fin.csv 2005-09-20 WUR:CE-FIN05-IV-1000-C 8.205e+02
3 data_2005Fin.csv 2005-09-20 WUR:CE-FIN05-IV-1001-A 1.184e+03
4 data_2005Fin.csv 2005-09-20 WUR:CE-FIN05-IV-1001-B 5.98e+02

Calculating the average tuber weight per genotype:

In [23]:
# 2005Fin data - averaged
# endpoint: http://localhost:7200/repositories/miappe_20'
datatable_query = (
'''
    PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>

    SELECT DISTINCT  ?bmID (avg(?TubW_total_per_plant) as ?TubW_T_avg)
    WHERE {
        ?study a ppeo:study ;
               ppeo:hasIdentifier '2005Fin' ;
               ppeo:hasDataFile ?file .

        ?file a ppeo:data_file ;
              ppeo:hasDigitalLocation ?filename ;
              ppeo:hasObservation ?obs1 .

        ?var1 a ppeo:observed_variable ;
              ppeo:hasIdentifier 'TubW_total_per_plant' .

        ?obs1 ppeo:hasVariable ?var1 ;
              ppeo:hasValue ?TubW_total_per_plant ;
              ppeo:hasDateTime ?date ;
              ppeo:hasObservedSubject ?ou .    

        ?ou a ppeo:observation_unit ;
            ppeo:hasIdentifier ?ouID ;
            ppeo:partOf ?study;
            ppeo:hasBiologicalMaterial ?bm ;
            ppeo:hasSpatialDistribution ?sd .

        ?sd ppeo:hasSpatialDistributionType ?sdtype .
        ?sd ppeo:hasValue "IV" .
        ?sdtype ppeo:hasType "block" .
            
        ?bm ppeo:hasIdentifier ?bmID .
    }
    GROUP BY ?bmID
''')
result = query_DB(datatable_query)
columnsToKeep = ['bmID.value', 'TubW_T_avg.value']
datatable_2005Fin_avg  = result[columnsToKeep]
for c in columnsToKeep:
    datatable_2005Fin_avg.rename(inplace=True, columns={c: c[:-6]})
datatable_2005Fin_avg.head()
Out[23]:
bmID TubW_T_avg
0 WUR:CE175 491.5e0
1 WUR:CE296 507.6666666666667e0
2 WUR:VOYAGER 1022.0e0
3 WUR:CE640 857.6666666666666e0
4 WUR:CE761 1151.6666666666667e0

Compare the genotypes that overlap in all experiments

In [24]:
# for average weight per genotype
datatable_2004Fin_avg2 = datatable_2004Fin_avg.drop('TubW_T_avg', axis=1)
finland_merge = pd.merge(datatable_2004Fin_avg2.rename(columns={'TubW_T_avg_perPlant': '2004Fin'}), 
                         datatable_2005Fin_avg.rename(columns={'TubW_T_avg': '2005Fin'}), on= 'bmID')
fin_et = pd.merge(datatable_2010ET_avg.rename(columns={'TubW_T_avg': '2010ET'}), 
                  finland_merge, on='bmID')

fin_et_ven = pd.merge(datatable_2003VE_perPlant, fin_et, on='bmID')
fin_et_ven.rename(columns={'TubW_average_per_genotype': '2003VE'}, inplace=True)

fin_et_ven_nl = pd.merge(datatable_1999NL_perPlant, fin_et_ven, on='bmID')
fin_et_ven_nl.rename(columns={'TubW_total_per_plant': '1999NL'}, inplace=True)
df_averageWeightPerGenotype = fin_et_ven_nl
df_averageWeightPerGenotype[
    ['1999NL', '2003VE', '2010ET', '2004Fin', '2005Fin']
] = df_averageWeightPerGenotype[
    ['1999NL', '2003VE', '2010ET', '2004Fin', '2005Fin']
].apply(pd.to_numeric)
df_averageWeightPerGenotype = df_averageWeightPerGenotype.sort_values('bmID')

display(df_averageWeightPerGenotype.head())
display(df_averageWeightPerGenotype.describe())

# pd.set_option('display.max_rows', 300)
bmID 1999NL 2003VE 2010ET 2004Fin 2005Fin
0 WUR:CE011 887.094971 142.5 254.916667 861.528870 392.166667
1 WUR:CE047 707.325012 230.0 442.083333 944.598877 570.833333
2 WUR:CE067 956.275024 62.5 300.416667 674.260010 713.166667
3 WUR:CE070 1371.640015 125.0 234.090909 678.725525 560.000000
4 WUR:CE100 632.919983 127.5 366.500000 607.843323 1694.500000
1999NL 2003VE 2010ET 2004Fin 2005Fin
count 80.000000 80.000000 80.000000 80.000000 80.000000
mean 1280.952881 126.162498 259.687519 853.342163 764.065000
std 631.450073 116.615913 87.935837 170.252243 313.102156
min 255.299988 17.000000 117.272727 604.811096 266.666667
25% 827.912506 65.625000 198.079545 723.781647 556.875000
50% 1247.385010 100.000000 236.765152 828.277740 705.500000
75% 1671.129944 159.125000 287.916667 971.573898 919.666667
max 3040.670166 975.000000 567.083333 1386.522339 1694.500000
In [25]:
# plotly graph for overlapping genotypes
colors = px.colors.qualitative.Plotly
layout = go.Layout(
    legend=dict(
        x=0, y=1,
        orientation='h',
        title_font_family='Tahoma',
        font=dict(family='Tahoma', size=9, color='black'),
        bgcolor='lightsteelblue', bordercolor='grey', borderwidth=1
    ),
    width=1000,
    height=450,
    title='Tuber weight per genotype (averaged by plant)',
    xaxis={'title':'Genotype', 'dtick':1, 'tickangle': 85, 
           'rangemode': 'tozero', 'range': [-1, len(df_averageWeightPerGenotype) + 1]},
    yaxis={'title':'plant tuber weight (g)', 'dtick': 200},
    margin=dict(l=20, r=0, t=40, b=10),
    paper_bgcolor='lightsteelblue',
)
fig = go.Figure(layout=layout)
for (exp, col) in zip(['1999NL', '2003VE', '2004Fin', '2005Fin', '2010ET'], colors[:5]):
    fig.add_traces(go.Scatter(x=df_averageWeightPerGenotype['bmID'], 
                              y=df_averageWeightPerGenotype[exp], 
                              mode='lines+markers', line=dict(color=col), name=exp))
fig.show()

Compare all genotypes in all experiments

Scatter plot

In [26]:
# for average weight per genotype
all_datatable_2004Fin_avg2 = datatable_2004Fin_avg.drop('TubW_T_avg', axis=1)
all_finland_merge = pd.merge(all_datatable_2004Fin_avg2.rename(columns={'TubW_T_avg_perPlant': '2004Fin'}), 
                             datatable_2005Fin_avg.rename(columns={'TubW_T_avg': '2005Fin'}), 
                             on= 'bmID', how='outer')
all_fin_et = pd.merge(datatable_2010ET_avg.rename(columns={'TubW_T_avg': '2010ET'}), 
                      all_finland_merge, on='bmID', how='outer')

all_fin_et_ven = pd.merge(datatable_2003VE_perPlant, all_fin_et, on='bmID', how='outer')
all_fin_et_ven.rename(columns={'TubW_average_per_genotype': '2003VE'}, inplace=True)

all_fin_et_ven_nl = pd.merge(datatable_1999NL_perPlant, all_fin_et_ven, on='bmID', how='outer')
all_fin_et_ven_nl.rename(columns={'TubW_total_per_plant': '1999NL'}, inplace=True)
df_averageWeightPerGenotype_all = all_fin_et_ven_nl
df_averageWeightPerGenotype_all[
    ['1999NL', '2003VE', '2010ET', '2004Fin', '2005Fin']
] = df_averageWeightPerGenotype_all[
    ['1999NL', '2003VE', '2010ET', '2004Fin', '2005Fin']
].apply(pd.to_numeric)
df_averageWeightPerGenotype_all = df_averageWeightPerGenotype_all.sort_values('bmID')

display(df_averageWeightPerGenotype_all.head())
display(df_averageWeightPerGenotype_all.describe())

pd.set_option('display.max_rows', 300)
# registered genotypes per study: 
# 2005Fin: 260, 2003VE: 107, 2010ET: 190, 2004Fin: 251, 1999NL: 242
bmID 1999NL 2003VE 2010ET 2004Fin 2005Fin
281 WUR:ASTARTE NaN NaN NaN 662.997742 1451.666667
245 WUR:AWASH NaN NaN 391.727273 NaN NaN
259 WUR:BELETE NaN NaN 552.727273 NaN NaN
250 WUR:BINTJE NaN NaN 257.500000 618.892212 1386.833333
254 WUR:BULLE NaN NaN 739.166667 NaN NaN
1999NL 2003VE 2010ET 2004Fin 2005Fin
count 233.000000 107.000000 188.000000 216.000000 253.000000
mean 1389.825928 121.906540 275.111095 842.187073 732.270290
std 678.495422 110.696785 119.832693 186.580933 315.822943
min 44.004997 8.000000 97.500000 575.451111 130.166667
25% 855.815002 56.000000 197.585859 699.551514 507.666667
50% 1339.489990 100.000000 240.916667 798.835571 646.000000
75% 1781.599976 167.250000 317.250000 966.128601 921.666667
max 3723.739990 975.000000 797.666667 1404.246704 1694.500000
In [27]:
# plotly graph for all genotypes
layout = go.Layout(
    legend=dict(
        x=0, y=1,
        orientation='h',
        title_font_family='Tahoma',
        font=dict(family='Tahoma', size=9, color='black'),
        bgcolor='lightsteelblue', bordercolor='grey', borderwidth=1
    ),
    width=1200,  #3200 is a good size for all labels - when dtick = 1
    height=450,
    title='Tuber weight per genotype (averaged by plant)',
    xaxis={'title':'Genotype', 'dtick':3, 'tickangle': 85, 
           'rangemode': 'tozero', 'range': [-2, len(df_averageWeightPerGenotype_all) + 2]},
    yaxis={'title':'plant tuber weight (g)', 'dtick': 200},
    margin=dict(l=20, r=0, t=40, b=10),
    paper_bgcolor='lightsteelblue',
)
fig = go.Figure(layout=layout)
for (exp, col) in zip(['1999NL', '2003VE', '2004Fin', '2005Fin', '2010ET'], colors[:5]):
    fig.add_traces(go.Scatter(x=df_averageWeightPerGenotype_all['bmID'], 
                              y=df_averageWeightPerGenotype_all[exp], 
                              mode='markers', line=dict(color=col), name=exp))
fig.show()

Stacked bar graph

In [28]:
# stacked bar graph for all genotypes
df_averageWeightPerGenotype_all_rev = df_averageWeightPerGenotype_all.sort_values('bmID', ascending=False)

# plotly graph for all genotypes
colors = px.colors.qualitative.Plotly
layout = go.Layout(
    legend=dict(
        xanchor='right',
        x=1,
        yanchor='top',
        y=1,
        orientation='v',
        title_font_family='Tahoma',
        font=dict(family='Tahoma', size=9, color='black'),
        bgcolor='lightsteelblue', bordercolor='grey', borderwidth=1
    ),
    width=800,  #3200 is a good size for all labels - when dtick = 1
    height=2500,
    title='Tuber weight per genotype (averaged by plant)',
    xaxis={'title': 'plant tuber weight (g)', 
           'dtick':200, 'tickangle': 60, 
           #'rangemode': 'tozero', 'range': [-2, len(df_averageWeightPerGenotype_all) + 2]
          },
    yaxis={'title':'Genotype', 'dtick': 1, 'tickfont':dict(family='Tahoma', size=7, color='black')},
    margin=dict(l=20, r=0, t=40, b=10),
    paper_bgcolor='lightsteelblue',
)

fig = go.Figure(layout=layout)

for (exp, col) in zip(['1999NL', '2003VE', '2004Fin', '2005Fin', '2010ET'], colors[:5]):
    fig.add_trace(go.Bar(
        y=df_averageWeightPerGenotype_all_rev['bmID'],
        x=df_averageWeightPerGenotype_all_rev[exp],
        width=0.8,
        name=exp, 
        orientation='h',
        opacity=0.6,
    ))

fig.update_layout(barmode='stack')
fig.show()
fig.write_html('stacked_bars.html')
fig.write_image('stacked_bars.svg')

Making plots combining this trait with the weather information.

Get weather data for all locations

For Netherlands 1999 (df_weather_1999NL):

In [29]:
# 1999NL weather data
datatable_query = (
'''
    PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>
    PREFIX ssn: <http://purl.oclc.org/NET/ssnx/ssn#>
    PREFIX w3ctime: <http://www.w3.org/2006/time#>
    PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
    PREFIX aemet: <http://aemet.linkeddata.es/ontology/>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

    SELECT distinct ?wsName ?date ?hours_of_sunlight ?mean_temperature 
    WHERE {
        SERVICE <http://localhost:3030/weather/query> {
            ?ws a aemet:WeatherStation ;
                aemet:stationName ?wsName .

            ?obs1 a aemet:Observation ;
                 aemet:valueOfObservedData ?hours_of_sunlight ;
                 ssn:observedBy ?ws;
                 aemet:observedInInterval ?date ;
                 ssn:observedProperty ?var1 .
            ?obs2 a aemet:Observation ;
                 aemet:valueOfObservedData ?mean_temperature ;
                 ssn:observedBy ?ws;
                 aemet:observedInInterval ?date ;
                 ssn:observedProperty ?var2 .

            ?var1 rdfs:label "Hours of sunlight" .
            ?var2 rdfs:label "Mean temperature" .

                #FILTER(?wsName IN('Holeta station'))
                #FILTER(?wsName IN('Merida station'))
                #FILTER(?wsName IN('Ruuki station') && (?date < "2005-01-01"^^xsd:date))
                #FILTER(?wsName IN('Ruuki station') && 	(?date > "2005-01-01"^^xsd:date))
                FILTER(?wsName IN('WUR station'))
        }
    } ORDER BY ASC(?wsName) ASC(?date)
''')
result = query_DB(datatable_query)
columnsToKeep = ['wsName.value', 'date.value', 'hours_of_sunlight.value', 'mean_temperature.value']
df_weather_1999NL = result[columnsToKeep]
for c in columnsToKeep:
    df_weather_1999NL.rename(inplace=True, columns={c: c[:-6]})
C:\Python37-32\lib\site-packages\pandas\core\frame.py:4025: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

For Venezuela 2003 (df_weather_2003VE):

In [30]:
# 2003VE weather data
datatable_query = (
'''
    PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>
    PREFIX ssn: <http://purl.oclc.org/NET/ssnx/ssn#>
    PREFIX w3ctime: <http://www.w3.org/2006/time#>
    PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
    PREFIX aemet: <http://aemet.linkeddata.es/ontology/>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

    SELECT distinct ?wsName ?date ?hours_of_sunlight ?mean_temperature 
    WHERE {
        SERVICE <http://localhost:3030/weather/query> {
            ?ws a aemet:WeatherStation ;
                aemet:stationName ?wsName .

            ?obs1 a aemet:Observation ;
                 aemet:valueOfObservedData ?hours_of_sunlight ;
                 ssn:observedBy ?ws;
                 aemet:observedInInterval ?date ;
                 ssn:observedProperty ?var1 .
            ?obs2 a aemet:Observation ;
                 aemet:valueOfObservedData ?mean_temperature ;
                 ssn:observedBy ?ws;
                 aemet:observedInInterval ?date ;
                 ssn:observedProperty ?var2 .

            ?var1 rdfs:label "Hours of sunlight" .
            ?var2 rdfs:label "Mean temperature" .

                #FILTER(?wsName IN('Holeta station'))
                FILTER(?wsName IN('Merida station'))
                #FILTER(?wsName IN('Ruuki station') && (?date < "2005-01-01"^^xsd:date))
                #FILTER(?wsName IN('Ruuki station') && (?date > "2005-01-01"^^xsd:date))
                #FILTER(?wsName IN('WUR station'))
        }
    } ORDER BY ASC(?wsName) ASC(?date)
''')
result = query_DB(datatable_query)
columnsToKeep = ['wsName.value', 'date.value', 'hours_of_sunlight.value', 'mean_temperature.value']
df_weather_2003VE = result[columnsToKeep]
for c in columnsToKeep:
    df_weather_2003VE.rename(inplace=True, columns={c: c[:-6]})

For Finland 2004 (df_weather_2004Fin):

In [31]:
# 2004Fin weather data
datatable_query = (
'''
    PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>
    PREFIX ssn: <http://purl.oclc.org/NET/ssnx/ssn#>
    PREFIX w3ctime: <http://www.w3.org/2006/time#>
    PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
    PREFIX aemet: <http://aemet.linkeddata.es/ontology/>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

    SELECT distinct ?wsName ?date ?hours_of_sunlight ?mean_temperature 
    WHERE {
        SERVICE <http://localhost:3030/weather/query> {
            ?ws a aemet:WeatherStation ;
                aemet:stationName ?wsName .

            ?obs1 a aemet:Observation ;
                 aemet:valueOfObservedData ?hours_of_sunlight ;
                 ssn:observedBy ?ws;
                 aemet:observedInInterval ?date ;
                 ssn:observedProperty ?var1 .
            ?obs2 a aemet:Observation ;
                 aemet:valueOfObservedData ?mean_temperature ;
                 ssn:observedBy ?ws;
                 aemet:observedInInterval ?date ;
                 ssn:observedProperty ?var2 .

            ?var1 rdfs:label "Hours of sunlight" .
            ?var2 rdfs:label "Mean temperature" .

                #FILTER(?wsName IN('Holeta station'))
                #FILTER(?wsName IN('Merida station'))
                FILTER(?wsName IN('Ruuki station') && (?date < "2005-01-01"^^xsd:date))
                #FILTER(?wsName IN('Ruuki station') && 	(?date > "2005-01-01"^^xsd:date))
                #FILTER(?wsName IN('WUR station'))
        }
    } ORDER BY ASC(?wsName) ASC(?date)
''')
result = query_DB(datatable_query)
columnsToKeep = ['wsName.value', 'date.value', 'hours_of_sunlight.value', 'mean_temperature.value']
df_weather_2004Fin = result[columnsToKeep]
for c in columnsToKeep:
    df_weather_2004Fin.rename(inplace=True, columns={c: c[:-6]})
df_weather_2004Fin.head()
Out[31]:
wsName date hours_of_sunlight mean_temperature
0 Ruuki station 2004-06-01 19.71 10.0
1 Ruuki station 2004-06-02 19.79 10.2
2 Ruuki station 2004-06-03 19.86 12.2
3 Ruuki station 2004-06-04 19.93 12.0
4 Ruuki station 2004-06-05 20.0 13.6

For Finland 2005 (df_weather_2005Fin):

In [32]:
# 2005Fin weather data
datatable_query = (
'''
    PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>
    PREFIX ssn: <http://purl.oclc.org/NET/ssnx/ssn#>
    PREFIX w3ctime: <http://www.w3.org/2006/time#>
    PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
    PREFIX aemet: <http://aemet.linkeddata.es/ontology/>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

    SELECT distinct ?wsName ?date ?hours_of_sunlight ?mean_temperature 
    WHERE {
        SERVICE <http://localhost:3030/weather/query> {
            ?ws a aemet:WeatherStation ;
                aemet:stationName ?wsName .

            ?obs1 a aemet:Observation ;
                 aemet:valueOfObservedData ?hours_of_sunlight ;
                 ssn:observedBy ?ws;
                 aemet:observedInInterval ?date ;
                 ssn:observedProperty ?var1 .
            ?obs2 a aemet:Observation ;
                 aemet:valueOfObservedData ?mean_temperature ;
                 ssn:observedBy ?ws;
                 aemet:observedInInterval ?date ;
                 ssn:observedProperty ?var2 .

            ?var1 rdfs:label "Hours of sunlight" .
            ?var2 rdfs:label "Mean temperature" .

                #FILTER(?wsName IN('Holeta station'))
                #FILTER(?wsName IN('Merida station'))
                #FILTER(?wsName IN('Ruuki station') && (?date < "2005-01-01"^^xsd:date))
                FILTER(?wsName IN('Ruuki station') && (?date > "2005-01-01"^^xsd:date))
                #FILTER(?wsName IN('WUR station'))
        }
    } ORDER BY ASC(?wsName) ASC(?date)
''')
result = query_DB(datatable_query)
columnsToKeep = ['wsName.value', 'date.value', 'hours_of_sunlight.value', 'mean_temperature.value']
df_weather_2005Fin = result[columnsToKeep]
for c in columnsToKeep:
    df_weather_2005Fin.rename(inplace=True, columns={c: c[:-6]})
df_weather_2005Fin.head()
Out[32]:
wsName date hours_of_sunlight mean_temperature
0 Ruuki station 2005-05-16 18.23 4.8
1 Ruuki station 2005-05-17 18.33 9.0
2 Ruuki station 2005-05-18 18.43 7.0
3 Ruuki station 2005-05-19 18.53 4.9
4 Ruuki station 2005-05-20 18.62 8.5

For Ethiopia 2010 (df_weather_2010ET):

In [33]:
# 2010ET weather data
datatable_query = (
'''
    PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>
    PREFIX ssn: <http://purl.oclc.org/NET/ssnx/ssn#>
    PREFIX w3ctime: <http://www.w3.org/2006/time#>
    PREFIX ppeo: <http://purl.org/ppeo/PPEO.owl#>
    PREFIX aemet: <http://aemet.linkeddata.es/ontology/>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

    SELECT distinct ?wsName ?date ?hours_of_sunlight ?mean_temperature 
    WHERE {
        SERVICE <http://localhost:3030/weather/query> {
            ?ws a aemet:WeatherStation ;
                aemet:stationName ?wsName .

            ?obs1 a aemet:Observation ;
                 aemet:valueOfObservedData ?hours_of_sunlight ;
                 ssn:observedBy ?ws;
                 aemet:observedInInterval ?date ;
                 ssn:observedProperty ?var1 .
            ?obs2 a aemet:Observation ;
                 aemet:valueOfObservedData ?mean_temperature ;
                 ssn:observedBy ?ws;
                 aemet:observedInInterval ?date ;
                 ssn:observedProperty ?var2 .

            ?var1 rdfs:label "Hours of sunlight" .
            ?var2 rdfs:label "Mean temperature" .

                FILTER(?wsName IN('Holeta station'))
                #FILTER(?wsName IN('Merida station'))
                #FILTER(?wsName IN('Ruuki station') && (?date < "2005-01-01"^^xsd:date))
                #FILTER(?wsName IN('Ruuki station') && 	(?date > "2005-01-01"^^xsd:date))
                #FILTER(?wsName IN('WUR station'))
        }
    } ORDER BY ASC(?wsName) ASC(?date)
''')
result = query_DB(datatable_query)
columnsToKeep = ['wsName.value', 'date.value', 'hours_of_sunlight.value', 'mean_temperature.value']
df_weather_2010ET = result[columnsToKeep]
for c in columnsToKeep:
    df_weather_2010ET.rename(inplace=True, columns={c: c[:-6]})
df_weather_2010ET.head()
Out[33]:
wsName date hours_of_sunlight mean_temperature
0 Holeta station 2010-07-17 12.59 13.875
1 Holeta station 2010-07-18 12.59 14.5625
2 Holeta station 2010-07-19 12.58 14.35
3 Holeta station 2010-07-20 12.58 13.4625
4 Holeta station 2010-07-21 12.57 13.1375

Calculate (cumulative) photo-beta thermal time

Beta thermal days: $\Large g \left ( T \right) = \left [ \left (\frac{T_c - T}{T_c-T_0} \right ) \cdot \left (\frac{T - T_b}{T_0-T_b} \right ) ^{\frac{T_0 - T_b}{T_c - T_0}} \right ]^{c_t}$

In [34]:
# calculate PBTT
df_weather_2003VE, cumPBTT_2003VE = calculate_PBTT(df_weather_2003VE)
df_weather_2004Fin, cumPBTT_2004Fin = calculate_PBTT(df_weather_2004Fin)
df_weather_2005Fin, cumPBTT_2005Fin = calculate_PBTT(df_weather_2005Fin)
df_weather_2010ET, cumPBTT_2010ET = calculate_PBTT(df_weather_2010ET)
df_weather_1999NL, cumPBTT_1999NL = calculate_PBTT(df_weather_1999NL)
In [35]:
# plotly graph for overlapping genotypes
layout = go.Layout(
    width=500,
    height=300,
    title='Cumulative photo-beta thermal time per study',
    xaxis={'title':'study', 'dtick':1, 'tickangle': 45, },
    yaxis={'title':'cumulative PBTT', 'dtick': 10},
    margin=dict(l=20, r=10, t=40, b=10),
    paper_bgcolor='lightsteelblue',
)
studies = ['1999NL', '2003VE', '2004Fin', '2005Fin', '2010ET']
values = [cumPBTT_1999NL, cumPBTT_2003VE, cumPBTT_2004Fin, cumPBTT_2005Fin, cumPBTT_2010ET]
values = [round(v,2) for v in values]
cpbtt = {'1999NL': values[0], '2003VE': values[1], '2004Fin': values[2], 
         '2005Fin': values[3], '2010ET': values[4]}
fig = go.Figure(layout=layout)
fig.add_trace(go.Bar(
    x=studies,
    y=[cpbtt[k] for k in studies],
    text=[round(v,2) for v in values], 
    textposition='auto',
    marker_color=colors[:5],
    opacity = 0.8
))
fig.show()

paula_cpbtt = {'1999NL': 48.75, '2003VE': 22.35, '2004Fin': 32.44, '2005Fin': 40.85, '2010ET': 24.62}
print('Cumulative PBTT differences from Paula\'s (Paula\'s - calculated here):')
for (s, pVal, eVal) in zip(studies, [paula_cpbtt[k] for k in studies], values):
    print('For {:>7}, {:4.2f} - {:4.2f} = {:+4.2f}'.format(s, pVal, eVal, round(pVal - eVal,2)))
Cumulative PBTT differences from Paula's (Paula's - calculated here):
For  1999NL, 48.75 - 56.18 = -7.43
For  2003VE, 22.35 - 23.11 = -0.76
For 2004Fin, 32.44 - 31.31 = +1.13
For 2005Fin, 40.85 - 39.30 = +1.55
For  2010ET, 24.62 - 24.13 = +0.49

Find location with best and worst performance for each genotype

In [36]:
# add columns to df, with values and index of min / max 
df_performance = df_averageWeightPerGenotype_all.set_index('bmID', inplace=False)
minCol = df_performance.idxmin(axis=1)
df_performance['max_study'] = df_performance.idxmax(axis=1)
df_performance['max_val'] = df_performance.max(axis=1)
df_performance['min_val'] = df_performance.min(axis=1)
df_performance['min_study'] = minCol

#list(df_performance.columns.values) - sorted in order of ascending cPBTT: 
# ['2003VE', '2010ET',  '2004Fin', '2005Fin', '1999NL']
df_perf_ordered = df_performance[['2003VE', '2010ET',  '2004Fin', '2005Fin', '1999NL']]
In [37]:
# plotly graph for performance
layout = go.Layout(
#    legend=dict(
#        x=0, y=1,
#        orientation='h',
#        title_font_family='Tahoma',
#        font=dict(family='Tahoma', size=9, color='black'),
#        bgcolor='lightsteelblue', bordercolor='grey', borderwidth=1
#    ),
    width=1000,
    height=450,
    title='Tuber weight per genotype (averaged by plant)',
    xaxis={'title':'cumulative PBTT', 'tickangle': 85},
    yaxis={'title':'plant tuber weight (g)', 'dtick': 200},
    margin=dict(l=20, r=0, t=40, b=10),
    paper_bgcolor='lightsteelblue',
)
fig = go.Figure(layout=layout)

for index, row in df_performance.iterrows():
    if cpbtt[row['min_study']] > cpbtt[row['max_study']]:
        col = 'darkred'
    else:
        col= 'darkgreen'
    label = index + ': ' + row['max_study'] + '-' + row['min_study']
    fig.add_traces(go.Scatter(x=[cpbtt[row['min_study']], cpbtt[row['max_study']]], 
                              y=[row['min_val'],row['max_val']],
                              line=dict(color=col),
                              name=label
                              )) 
fig.show()
In [38]:
# plotly graph for performance
layout = go.Layout(
#    legend=dict(
#        x=0, y=1,
#        orientation='h',
#        title_font_family='Tahoma',
#        font=dict(family='Tahoma', size=9, color='black'),
#        bgcolor='lightsteelblue', bordercolor='grey', borderwidth=1
#    ),
    width=1000,
    height=450,
    title='Tuber weight per genotype (averaged by plant)',
    xaxis={'title':'cumulative PBTT', 'tickangle': 85},
    yaxis={'title':'plant tuber weight (g)', 'dtick': 200},
    margin=dict(l=20, r=0, t=40, b=10),
    paper_bgcolor='lightsteelblue',
)
fig = go.Figure(layout=layout)
studies_ordered = ['2003VE', '2010ET',  '2004Fin', '2005Fin', '1999NL']
cpbtt_labels = [cpbtt[s] for s in studies_ordered]

at_least_2 = []
at_least_3 = []
at_least_4 = []
at_least_5 = []
points = []
monotonic_counter = 0
for index, row in df_performance.iterrows():
    values_ordered = [row[s] for s in studies_ordered]
    comparable_values = [v for v in values_ordered if not pd.isnull(v)]
    if monotonic(comparable_values):
        col = 'darkgreen'
        if len(comparable_values) > 4: at_least_5.append(index)
        elif len(comparable_values) > 3: at_least_4.append(index)
        elif len(comparable_values) > 2: at_least_3.append(index)
        elif len(comparable_values) > 1: at_least_2.append(index)
        else: points.append(index)
        monotonic_counter += 1
    else:
        col= 'darkred'
        
    label = index #+ ': ' + row['max_study'] + '-' + row['min_study']
    fig.add_traces(go.Scatter(x=cpbtt_labels, 
                              y=values_ordered,
                              line=dict(color=col),
                              name=label
                              )) 
fig.show()
In [39]:
print('1: ' + str(len(points)))
print('2: ' + str(len(at_least_2)))
print('3: ' + str(len(at_least_3)))
print('4: ' + str(len(at_least_4)) + ' ' + str(at_least_4))
print('5: ' + str(len(at_least_5)))

print(monotonic_counter)
1: 38
2: 36
3: 27
4: 23 ['WUR:CE171', 'WUR:CE277', 'WUR:CE298', 'WUR:CE350', 'WUR:CE615', 'WUR:CE628', 'WUR:CE632', 'WUR:CE647', 'WUR:CE651', 'WUR:CE665', 'WUR:CE666', 'WUR:CE675', 'WUR:CE678', 'WUR:CE687', 'WUR:CE696', 'WUR:CE699', 'WUR:CE720', 'WUR:CE723', 'WUR:CE724', 'WUR:CE761', 'WUR:CE765', 'WUR:CE776', 'WUR:CE778']
5: 26
150
In [ ]: